import requests
import re

# Crawl poems category by category (one index page per major collection)
def geturltnext(url):
    """Fetch a poem-category index page and return its HTML as text.

    :param url: absolute URL of a gushiwen.cn category index page
    :return: the response body as a decoded string
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36',
    }
    # BUG FIX: the second positional argument of requests.get is `params`,
    # so the headers dict was being sent as query-string parameters and the
    # User-Agent header was never actually set. Pass it by keyword instead.
    # A timeout prevents the crawler from hanging forever on a dead server.
    response = requests.get(url, headers=headers, timeout=10)
    return response.text

def nexturl(text):
    """Extract the per-poem detail-page links from a category index page.

    Prints each link with a 1-based running counter (progress output) and
    returns the list of href strings exactly as matched by the regex.

    :param text: HTML of a category index page
    :return: list of href strings (already absolute on these pages)
    """
    # Each poem entry looks like: <span><a href="..." target=...> — capture the href.
    # (Previously the matches were run through zip()/"".join() one-tuples and a
    # variable that shadowed both the builtin `str` and this function's own
    # name; that added nothing, so the matches are used directly.)
    nexturls = re.findall(r'<span><a href="(.*?)" target', text, re.DOTALL)
    for count, link in enumerate(nexturls, start=1):
        print(count)
        print(link)
        print('---' * 80)
    return nexturls

def parse_page(url):
    """Download one poem page and extract title / author / dynasty / content.

    Prints each extracted poem and returns them as a list of stringified
    single-entry dicts (the original storage format, kept for compatibility
    with writeintext and the existing output file).

    :param url: absolute URL of a poem detail page
    :return: list of str, each the repr of [{'title':..., 'authors':...,
             'dynasties':..., 'contents':...}]
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36',
    }
    # BUG FIX: headers must be passed by keyword; positionally the dict lands
    # in `params` and the User-Agent header is never sent.
    response = requests.get(url, headers=headers, timeout=10)
    text = response.text
    title1 = re.findall(r'<h1 style=.*?">(.*?)</h1>', text, re.DOTALL)  # page-level <h1> title
    titles = re.findall(r'<div\sclass="cont">.*?<b>(.*?)</b>', text, re.DOTALL)  # per-poem titles
    authors = re.findall(r'<p class="source">.*?<a.*?>(.*?)</a>', text, re.DOTALL)  # authors
    dynasties = re.findall(r'<p class="source">.*?<a.*?>.*?<a.*?>(.*?)</a>', text, re.DOTALL)  # dynasties
    contents_tags = re.findall(r'<div class="contson" .*?>(.*?)</div>', text, re.DOTALL)  # body HTML
    # Strip any remaining inline tags and surrounding whitespace from each body.
    contents = [re.sub(r'<.*?>', '', tag).strip() for tag in contents_tags]
    poems = []
    for title, author, dynasty, content in zip(titles, authors, dynasties, contents):
        # On single-poem pages the <b> match can collide with the author name;
        # fall back to the <h1> title in that case.
        if author == title:
            title = "".join(title1)
        poem = [
            {
                'title': title,
                'authors': author,
                'dynasties': dynasty,
                'contents': content
            }
        ]
        poems.append(str(poem))
    for poem in poems:
        print(poem)
        print('---' * 80)
    return poems

def writeintext(poems, w):
    """Append each poem string in `poems` to the open file object `w`,
    terminating every entry with a newline."""
    for entry in poems:
        # Entries are already plain strings; join() on a str is the identity.
        w.write("".join(entry) + "\n")


def main():
    """Crawl every category index URL in `urls`, follow each poem link,
    and append the parsed poems to 诗词.txt."""
    # Other category index pages that can be crawled by swapping the list:
    #   https://so.gushiwen.cn/gushi/yuefu.aspx    (Yuefu collection)
    #   https://so.gushiwen.cn/gushi/tangshi.aspx  (300 Tang poems)
    #   https://so.gushiwen.cn/gushi/sanbai.aspx   (300 ancient poems)
    #   https://so.gushiwen.cn/gushi/songsan.aspx  (300 Song ci)
    #   https://so.gushiwen.cn/gushi/xiaoxue.aspx  (primary-school texts)
    urls = ["https://so.gushiwen.cn/wenyan/chuwen.aspx",]  # middle-school classical texts

    # Use a context manager so the output file is closed even if a fetch or
    # parse raises part-way through (the original leaked the handle on error).
    with open('诗词.txt', 'a', encoding='utf-8') as w:
        for url in urls:
            text = geturltnext(url)
            for nurl in nexturl(text):
                poems = parse_page(nurl)
                writeintext(poems, w)


if __name__ == '__main__':
    main()